# See also inline comments with code.
# System info.
version
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 3
## minor 3.2
## year 2016
## month 10
## day 31
## svn rev 71607
## language R
## version.string R version 3.3.2 (2016-10-31)
## nickname Sincere Pumpkin Patch
date()
## [1] "Tue Jan 10 18:18:17 2017"
# Load libraries.
require(car)
require(class)
require(colorspace)
require(cowplot)
require(GGally)
require(gbm)
require(glmnet)
require(gplots)
require(ggplot2)
require(ISLR)
require(leaps)
require(MASS)
require(randomForest)
require(tree)
# Set deliberate seed, so can reproduce results.
set.seed(100)
# Helper functions.
# This is just used to blat the workspace between questions - to help keep them self-contained.
# Less chance of a bug being caused by unwittingly using a variable from a previous question, etc.
clear_workspace <- function() {
  # Names to survive the purge: the shared helpers used across questions.
  keep <- c("best_knn", "clear_workspace")
  # Remove everything else from the global environment in one pass
  # (single ls() call instead of two).
  rm(
    list = setdiff(ls(envir = .GlobalEnv), keep),
    envir = .GlobalEnv
  )
}
# There's a lot of repetition in calls to various KNN models, primarily around looping over
# values of k and printing the confusion matrix for the best one. So this function just
# allows this to be factored out for re-use.
# Fit KNN classifiers for k = 1..maxk, plot the test error against k,
# then refit with the best (lowest-error) k and report its accuracy,
# prediction error and confusion matrix.
#
# Args:
#   maxk    - largest number of neighbours to try.
#   train.X - matrix of training predictors.
#   test.X  - matrix of test predictors.
#   train.Y - class labels for the training rows.
#   test.Y  - class labels for the test rows.
best_knn <- function(
  maxk,
  train.X,
  test.X,
  train.Y,
  test.Y) {
  # Test error for each candidate value of k.
  err <- vector(mode = "numeric", length = maxk)
  for (k in seq_len(maxk)) {
    knn.pred <- knn(
      train = train.X,
      test = test.X,
      cl = train.Y,
      k = k
    )
    err[k] <- mean(knn.pred != test.Y)
  }
  # Plot k vs err.
  print(ggplot() +
    geom_point(
      alpha = 0.4,
      aes(
        x = seq_len(maxk),
        y = err
      )
    ) +
    ggtitle("Classification Error for different values of K") +
    labs(
      x = "k",
      y = "error"
    )
  )
  # Find the value of k which had the minimum error (ties -> smallest k).
  bestk <- which.min(err)
  print(sprintf("best k = %d", bestk))
  # Now retrain the model with the best value of k.
  knn.pred <- knn(
    train = train.X,
    test = test.X,
    cl = train.Y,
    k = bestk
  )
  # Output the percentage of correct predictions for best value of k.
  print(sprintf("percentage correct = %.2f", 100 * mean(knn.pred == test.Y)))
  # Output the prediction error for best value of k.
  print(sprintf("prediction error = %.4f", mean(knn.pred != test.Y)))
  # Output the confusion matrix for best value of k.
  print(table(
    "Predicted" = knn.pred,
    "Actual" = test.Y
  ))
}
This question should be answered using the Weekly data set, which is part of the ISLR package.
# Attach the dataframe, for convenience.
attach(Weekly)
(a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
# Eyeball column names.
names(Weekly)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
# Check data dimensions.
dim(Weekly)
## [1] 1089 9
# Print summary statistics.
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
# Plot correlation matrix.
ggpairs(
data = Weekly,
mapping = aes(
colour = Direction,
alpha = 0.4
)
)
# Check correlation values (excluding Direction, since this is categorical).
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
# Year and Volume appear correlated, plot the mean volume (plus indication of daily range) per year.
means <- aggregate(
formula = Volume ~ Year,
data = Weekly,
FUN = mean
)
ggplot() +
geom_point(
data = Weekly,
alpha = 0.4,
aes(
x = Year,
y = Volume
)
) +
geom_line(
data = means,
alpha = 0.4,
colour = "blue",
aes(
x = Year,
y = Volume
)
) +
ggtitle("Volume / Mean Volume by Year") +
labs(
x = "Year",
y = "Volume"
)
(b) Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
# Fit a glm model to our data.
glm.fit <- glm(
formula = Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data = Weekly,
family = binomial
)
# Output summary stats.
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
(c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
# See below. The confusion matrix is telling us which predictions our model got right (true negative for Down and true positive for Up), as well as which predictions it got wrong (false negative for Down - when the actual Direction was Up - and false positive for Up, when the actual direction was Down).
In this case, the model got a total of (54+557)/1089 predictions correct, or 56.11% (increasing to 56.43% if we just take the cases where the model predicts Up). This is better than chance, although just predicting the majority class (48+557 actual instances of Up) would have resulted in 55.55% correct predictions. Of course, all of this is based on a single data set rather than a training/test split - so we have no idea how these percentages will generalise.
# Use the glm model to predict the probability that Direction is Up, given just the predictors.
glm.probs <- predict(
object = glm.fit,
type = "response"
)
# Translate the probabilities into categorical responses.
numrows <- nrow(Weekly)
glm.pred <- rep("Down", numrows)
glm.pred[glm.probs >.5] = "Up"
# Output the confusion matrix.
table(
"Predicted" = glm.pred,
"Actual" = Direction
)
## Actual
## Predicted Down Up
## Down 54 48
## Up 430 557
# Output the fraction of correct predictions.
sprintf("fraction of correct predictions: %.4f", mean(glm.pred == Direction))
## [1] "fraction of correct predictions: 0.5611"
(d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
# Create our "binary filter" to apply to the dataframe.
train <- (Year < 2009)
# Slice out the data where the filter values are FALSE e.g. Year >= 2009. This becomes our test data set.
Weekly.test <- Weekly[!train,]
Direction.test <- Direction[!train]
numrows.test <- nrow(Weekly.test)
# Fit a glm model to the subset of the Weekly data which matches filter values of TRUE (e.g. Year < 2009).
# Note that we're only using Lag2 as predictor, as instructed.
glm.fit <- glm(
formula = Direction~Lag2,
data = Weekly,
family = binomial,
subset = train
)
# Use the glm model to predict the probability that Direction is Up, given just the predictors.
glm.probs <- predict(
object = glm.fit,
newdata = Weekly.test,
type = "response"
)
# Translate the probabilities into categorical responses.
glm.pred <- rep("Down", numrows.test)
glm.pred[glm.probs >.5] = "Up"
# Output the confusion matrix.
table(
"Predicted" = glm.pred,
"Actual" = Direction.test
)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
# Output the fraction of correct predictions.
sprintf("fraction of correct predictions: %.4f", mean(glm.pred == Direction.test))
## [1] "fraction of correct predictions: 0.6250"
(g) Repeat (d) using KNN with K=1
# Create test/training matrices from Lag2 column.
train.X <- as.matrix(Lag2[train])
test.X <- as.matrix(Lag2[!train])
Direction.train <- Direction[train]
knn.pred = knn(
train = train.X,
test = test.X,
cl = Direction.train,
k = 1
)
# Output the confusion matrix.
table(
"Predicted" = knn.pred,
"Actual" = Direction.test
)
## Actual
## Predicted Down Up
## Down 21 29
## Up 22 32
# Output the fraction of correct predictions.
sprintf("fraction of correct predictions: %.4f", mean(knn.pred == Direction.test))
## [1] "fraction of correct predictions: 0.5096"
(h) Which of these methods appears to provide the best results on this data?
(i) Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
# See below; adding an interaction between Lag1 and the product of Lag1 and Lag2 to the logistic regression formula increases the percentage of correct predictions to 64.42%. Despite trying values of k from 1:100, KNN could not match this (the best value of k being 72, with 61.54% correct classifications).
Note that the question explicitly asks us to evaluate which variable “provide the best results on the held out data”, even though this means that we’re implicitly tuning our model to the test data (rather than using e.g. cross-validation on the training set).
# Logistic Regression
glm.fit <- glm(
formula = Direction~Lag2+Lag1:I(Lag1*Lag2),
data = Weekly,
family = binomial,
subset = train
)
glm.probs <- predict(
object = glm.fit,
newdata = Weekly.test,
type = "response"
)
glm.pred <- rep("Down", numrows.test)
glm.pred[glm.probs >.5] <- "Up"
table(
"Predicted" = glm.pred,
"Actual" = Direction.test
)
## Actual
## Predicted Down Up
## Down 8 2
## Up 35 59
sprintf("fraction of correct predictions: %.4f", mean(glm.pred == Direction.test))
## [1] "fraction of correct predictions: 0.6442"
# KNN
best_knn(100, train.X, test.X, Direction.train, Direction.test)
## [1] "best k = 72"
## [1] "percentage correct = 61.54"
## [1] "prediction error = 0.3846"
## Actual
## Predicted Down Up
## Down 15 12
## Up 28 49
(a) Write a function, Power(), that prints out the result of raising 2 to the 3rd power. In other words, your function should compute 2^3 and print out the results.
# Print the result of raising 2 to the 3rd power, to two decimal places.
power <- function() {
  result <- 2 ^ 3
  sprintf("2 ^ 3 = %.2f", result)
}
power()
## [1] "2 ^ 3 = 8.00"
(b) Create a new function, Power2(), that allows you to pass any two numbers, x and a, and prints out the value of x^a.
# Format x raised to the power a, echoing the call that produced it.
power2 <- function(x, a) {
  result <- x ^ a
  sprintf("power2(%d,%d) = %.2f", x, a, result)
}
(c) Using the Power2() function that you just wrote, compute 10^3, 8^17, and 131^3.
power2(10,3)
## [1] "power2(10,3) = 1000.00"
power2(8,17)
## [1] "power2(8,17) = 2251799813685248.00"
power2(131,3)
## [1] "power2(131,3) = 2248091.00"
(d) Now create a new function, Power3(), that actually returns the result x^a as an R object, rather than simply printing it to the screen.
# Return x raised to the power a as an R object (vectorized over x and a).
power3 <- function(x, a) {
  x ^ a
}
(e) Now using the Power3() function, create a plot of f(x)=x^2. The x-axis should display a range of integers from 1 to 10, and the y-axis should display x^2. Label the axes appropriately, and use an appropriate title for the figure. Consider displaying either the x-axis, the y-axis, or both on the log-scale.
x <- 1:10
xsquared <- power3(x,2)
ggplot() +
geom_line(
alpha = 0.4,
aes(
x = x,
y = xsquared
)
) +
scale_x_log10() +
scale_y_log10() +
ggtitle("x^2 (log10 x and y axes)") +
labs(
x = "x",
y = "x^2"
)
(f) Create a function, PlotPower(), that allows you to create a plot of x against x^a for a fixed a and for a range of values of x. For instance, if you call PlotPower(1:10,3) then a plot should be created with an x-axis taking on values 1,2,…,10, and a y-axis taking on values 1^3,2^3,…,10^3.
# Plot xrange against xrange^pow in a 2x2 grid of panels: linear axes,
# log10 x-axis, log10 y-axis, and log10 on both axes.
#
# Args:
#   xrange - vector of x values to plot.
#   pow    - fixed exponent applied to each x value.
PlotPower <- function(xrange, pow) {
  label <- paste0("x^", pow)
  # Shared line layer and axis labels, reused across all four panels.
  plotline <- geom_line(
    aes(
      x = xrange,
      y = power3(xrange, pow)
    )
  )
  plotlabs <- labs(
    x = "x",
    y = label
  )
  p1 <- ggplot() + plotline + ggtitle(label) + plotlabs
  label2 <- paste(label, "(log10 x-axis)", sep = " ")
  p2 <- ggplot() + plotline + ggtitle(label2) + plotlabs + scale_x_log10()
  label3 <- paste(label, "(log10 y-axis)", sep = " ")
  p3 <- ggplot() + plotline + ggtitle(label3) + plotlabs + scale_y_log10()
  label4 <- paste(label, "(log10 x and y axes)", sep = " ")
  p4 <- ggplot() + plotline + ggtitle(label4) + plotlabs + scale_x_log10() + scale_y_log10()
  # Arrange the four panels in a grid (cowplot::plot_grid).
  plot_grid(p1, p2, p3, p4)
}
PlotPower(1:10,3)
Finish Exercise 10 from last homework
(e) Repeat (d) using LDA.
lda.fit <- lda(
formula = Direction~Lag2,
data = Weekly,
subset = train
)
lda.pred <- predict(
object = lda.fit,
newdata = Weekly.test
)
lda.class <- lda.pred$class
table(
"Predicted" = lda.class,
"Actual" = Direction.test
)
## Actual
## Predicted Down Up
## Down 9 5
## Up 34 56
sprintf("fraction of correct predictions: %.4f", mean(lda.class == Direction.test))
## [1] "fraction of correct predictions: 0.6250"
(f) Repeat (d) using QDA.
qda.fit <- qda(
formula = Direction~Lag2,
data = Weekly,
subset = train
)
qda.pred <- predict(
object = qda.fit,
newdata = Weekly.test
)
qda.class <- qda.pred$class
table(
"Predicted" = qda.class,
"Actual" = Direction.test
)
## Actual
## Predicted Down Up
## Down 0 0
## Up 43 61
sprintf("fraction of correct predictions: %.4f", mean(qda.class == Direction.test))
## [1] "fraction of correct predictions: 0.5865"
# Housekeeping.
clear_workspace()
detach(Weekly)
(a) Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median.
# Create a factor column for our mpg01 variable.
mpg.median <- median(Auto$mpg)
mpg01 <- as.factor(ifelse(Auto$mpg < mpg.median, 0, 1))
# Create a new version of Auto with the mpg01 column added.
Auto <- data.frame(Auto, mpg01)
# Remove temporary variables to avoid confusion due to e.g. shadowing.
rm(mpg.median, mpg01)
# Only attach our Auto frame now, to avoid confusion due to e.g. shadowing.
# Use of attach/detach is of course debatable due to these issues..
attach(Auto)
summary(Auto)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name mpg01
## amc matador : 5 0:196
## ford pinto : 5 1:196
## toyota corolla : 5
## amc gremlin : 4
## amc hornet : 4
## chevrolet chevette: 4
## (Other) :365
(b) Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.
All of the variables except name appear to be correlated with mpg - and by extension, mpg01 - to some extent. However, there are likely to be significant colinearities between them - with more cylinders, displacement and horsepower, for example, resulting in greater weight (e.g. a bigger engine).
VIF results from a fitted linear model confirmed this; iteratively removing predictors with VIF > 5 left only cylinders, acceleration, year and origin - with only cylinders, year and origin being significantly correlated. Of these, there is a definite suggestion of non-linearity from the scatter plot of cylinders vs mpg - and a polynomial of degree three proved significant.
# Plot correlation matrix - using ggpairs (since this automatically gives us both scatterplots and boxplots).
ggpairs(
data = Auto[,-9],
mapping = aes(
colour = mpg01,
alpha = 0.4
)
)
# Check for colinearities.
glm.fit <- glm(
formula = mpg01~cylinders+displacement+horsepower+weight+acceleration+year+origin,
family = binomial,
data = Auto
)
vif(glm.fit)
## cylinders displacement horsepower weight acceleration
## 4.900837 8.852836 2.691385 4.865771 2.651544
## year origin
## 1.686770 1.901268
# Remove any predictors with vif > 5 and repeat until all < 5
glm.fit <- update(glm.fit, . ~ . -displacement)
vif(glm.fit)
## cylinders horsepower weight acceleration year
## 2.107526 2.673592 3.392873 2.618422 1.596734
## origin
## 1.307502
# Summary of our final model - only cylinders, year and origin have significant p values
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ cylinders + horsepower + weight + acceleration +
## year + origin, family = binomial, data = Auto)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3976 -0.1066 0.0083 0.2129 3.1958
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.706e+01 5.735e+00 -2.974 0.00294 **
## cylinders -1.071e-01 2.780e-01 -0.385 0.70014
## horsepower -4.126e-02 2.385e-02 -1.730 0.08357 .
## weight -4.208e-03 9.541e-04 -4.410 1.03e-05 ***
## acceleration 1.365e-02 1.407e-01 0.097 0.92268
## year 4.266e-01 7.314e-02 5.833 5.44e-09 ***
## origin 4.422e-01 2.994e-01 1.477 0.13965
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 543.43 on 391 degrees of freedom
## Residual deviance: 157.57 on 385 degrees of freedom
## AIC: 171.57
##
## Number of Fisher Scoring iterations: 8
glm.fit <- update(glm.fit, . ~ . -acceleration)
# Checking for any significant polynomials since relationship between cylinders and mpg appears non-linear
glm.fit <- update(glm.fit, . ~ . - cylinders + poly(cylinders,3))
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ horsepower + weight + year + origin + poly(cylinders,
## 3), family = binomial, data = Auto)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.79864 -0.10001 0.00351 0.14682 2.63330
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.244e+01 6.062e+00 -3.701 0.000215 ***
## horsepower -4.209e-02 1.753e-02 -2.402 0.016327 *
## weight -4.489e-03 9.008e-04 -4.984 6.24e-07 ***
## year 5.071e-01 8.882e-02 5.710 1.13e-08 ***
## origin 6.952e-01 3.424e-01 2.030 0.042332 *
## poly(cylinders, 3)1 1.136e+01 1.018e+01 1.116 0.264447
## poly(cylinders, 3)2 1.602e+01 4.857e+00 3.299 0.000970 ***
## poly(cylinders, 3)3 1.337e+01 3.260e+00 4.100 4.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 543.43 on 391 degrees of freedom
## Residual deviance: 132.39 on 384 degrees of freedom
## AIC: 148.39
##
## Number of Fisher Scoring iterations: 8
formula <- as.formula(mpg01 ~ cylinders + I(cylinders^3) + year + origin)
(c) Split the data into a training set and a test set.
# Am using a 50/50 split per the ISLR examples
train <- sample(nrow(Auto), nrow(Auto)/2)
Auto.train <- Auto[train,]
Auto.test <- Auto[-train,]
mpg01.test <- mpg01[-train]
numrows.test <- nrow(Auto.test)
(d) Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
lda.fit <- lda(
formula = formula,
data = Auto.train
)
lda.pred <- predict(
object = lda.fit,
newdata = Auto.test
)
lda.class = lda.pred$class
table(
"Predicted" = lda.class,
"Actual" = mpg01.test
)
## Actual
## Predicted 0 1
## 0 79 5
## 1 14 98
sprintf("test error: %.4f", mean(lda.class != mpg01.test))
## [1] "test error: 0.0969"
(e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
qda.fit <- qda(
formula = formula,
data = Auto.train
)
qda.pred <- predict(
object = qda.fit,
newdata = Auto.test
)
qda.class <- qda.pred$class
table(
"Predicted" = qda.class,
"Actual" = mpg01.test
)
## Actual
## Predicted 0 1
## 0 79 2
## 1 14 101
sprintf("test error: %.4f", mean(qda.class != mpg01.test))
## [1] "test error: 0.0816"
(f) Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
glm.fit <- glm(
formula = formula,
data = Auto.train,
family = binomial
)
glm.probs <- predict(
object = glm.fit,
newdata = Auto.test,
type = "response"
)
glm.pred <- rep(0, numrows.test)
glm.pred[glm.probs >.5] = 1
table(
"Predicted" = glm.pred,
"Actual" = mpg01.test
)
## Actual
## Predicted 0 1
## 0 75 0
## 1 18 103
sprintf("test error: %.4f", mean(glm.pred != mpg01.test))
## [1] "test error: 0.0918"
(g) Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?
# KNN
cylinders3 = cylinders ^ 3
standardized.X <- scale(data.frame(cylinders3, Auto)[, 1:9])
predictors = c("cylinders", "cylinders3", "year", "origin")
train.X <- standardized.X[train, predictors]
test.X <- standardized.X[-train, predictors]
train.Y <- mpg01[train]
test.Y <- mpg01[-train]
best_knn(100, train.X, test.X, train.Y, test.Y)
## [1] "best k = 8"
## [1] "percentage correct = 91.84"
## [1] "prediction error = 0.0816"
## Actual
## Predicted 0 1
## 0 78 1
## 1 15 102
clear_workspace()
detach(Auto)
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
# See below (especially in-line comments). I adopted two subsets of predictors, one based on p-values (having adjusted for VIF), the other based on best subsets. Logistic regression performed better than LDA, with the “significant predictors” model performing better (test error 9.88%) than best subset (test error of 11.46%).
However KNN (k=1) beat both parametric models, with a “significant predictors” test error of 3.95%. This is not surprising, since the data represents geographical areas and is more likely to be consistent within suburbs than between them. Especially in downtown regions (presumably observations ~375 to ~475) which may be special cases.
# As before - construct our new factor variable, add it to our dataframe, cleanup, and then attach.
crim.median <- median(Boston$crim)
crim01 <- as.factor(ifelse(Boston$crim < crim.median, 0, 1))
Boston <- data.frame(Boston, crim01)
rm(crim.median, crim01)
attach(Boston)
# Show scatterplots and boxplots.
ggpairs(
data = Boston[,-9],
mapping = aes(
colour = crim01,
alpha = 0.4
)
)
# Check for colinearities.
glm.fit <- glm(
formula = crim01~zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv,
data = Boston,
family = binomial
)
vif(glm.fit)
## zn indus chas nox rm age dis rad
## 1.872010 2.640170 1.233983 4.111013 5.734988 2.455666 3.876479 1.983825
## tax ptratio black lstat medv
## 1.932622 2.191848 1.107171 2.670299 8.088729
# Remove any predictors with vif > 5 and repeat until all < 5
glm.fit <- update(glm.fit, . ~ . -medv)
vif(glm.fit)
## zn indus chas nox rm age dis rad
## 1.731690 2.622190 1.171490 3.701289 2.239184 1.916569 2.964372 1.872988
## tax ptratio black lstat
## 1.811987 1.512531 1.105251 2.659335
# Check which predictors are significant - leaving zn, nox, rm, dis, rad, tax, ptratio.
summary(glm.fit)
##
## Call:
## glm(formula = crim01 ~ zn + indus + chas + nox + rm + age + dis +
## rad + tax + ptratio + black + lstat, family = binomial, data = Boston)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5155 -0.2144 -0.0008 0.0031 3.2765
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -32.394832 6.327425 -5.120 3.06e-07 ***
## zn -0.075484 0.031780 -2.375 0.01754 *
## indus -0.055163 0.043127 -1.279 0.20087
## chas 0.866508 0.751363 1.153 0.24881
## nox 44.493847 6.937852 6.413 1.42e-10 ***
## rm 1.034095 0.420817 2.457 0.01400 *
## age 0.007593 0.010389 0.731 0.46485
## dis 0.479160 0.190858 2.511 0.01205 *
## rad 0.643633 0.146068 4.406 1.05e-05 ***
## tax -0.007432 0.002624 -2.833 0.00462 **
## ptratio 0.208206 0.098197 2.120 0.03398 *
## black -0.012674 0.006695 -1.893 0.05833 .
## lstat 0.043258 0.047431 0.912 0.36176
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 218.82 on 493 degrees of freedom
## AIC: 244.82
##
## Number of Fisher Scoring iterations: 9
glm.fit <- update(glm.fit, . ~ zn+nox+rm+dis+rad+tax+ptratio)
# Check for non-linearity (plot residuals).
# Interestingly, plotting the residuals vs crim (not crim01) shows that things go a bit crazy around
# row 400. A plot of glm.fit flags up rows 381, 406 and 419 - which do appear to be outliers, having
# the three highest values of crim (all above 73, vs a median of 0.2565). Presumably these are city
# centre areas; they do have higher tax and lower dis values. Maybe splines would help with handling
# this(?)
ggplot() +
geom_point(
alpha = 0.4,
aes(
x = 1:nrow(Boston),
y = lm(crim ~ zn+nox+rm+dis+rad+tax+ptratio)$residuals
)
) +
ggtitle("Residuals for crim~zn+nox+rm+dis+rad+tax+ptratio") +
labs(
x = "observation",
y = "residual"
)
# The residuals from the binomial model is consistent with this idea, in that there are a series
# of flat lines (presumably similar types of suburb) interspersed by small regions of variability.
# Possibly there is some kind of geographical ordering to the Boston data that results in this.
ggplot() +
geom_point(
alpha = 0.4,
aes(
x = 1:nrow(Boston),
y = glm.fit$residuals
)
) +
scale_y_continuous(
limits = c(0, 10),
oob = scales::squish
) +
ggtitle("Residuals for crim01~zn+nox+rm+dis+rad+tax+ptratio") +
labs(
x = "observation",
y = "residual"
)
# By way of an alternative to basing the model on p-values, I'm checking best subsets.
# Note I'm using Cp as estimate of test error, on the whole data set, for convenience.
nvmax <- 13
Boston.best = regsubsets(
x = crim01~zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv,
data=Boston,
nvmax=nvmax
)
best.summary = summary(Boston.best)
bestnv <- which.min(best.summary$cp)
sprintf("best nv: %d", bestnv)
## [1] "best nv: 6"
Boston.best.coef <- coef(Boston.best, bestnv)
print(names(Boston.best.coef[-1]))
## [1] "zn" "nox" "age" "rad" "black" "medv"
# Create training/test split
train <- sample(nrow(Boston), nrow(Boston)/2) # sticking with 50/50 split per ISLR examples.
Boston.train <- Boston[train,]
Boston.test <- Boston[-train,]
numrows.test <- nrow(Boston.test)
crim01.test <- crim01[-train]
significant.predictors <- as.formula(crim01 ~ zn+nox+rm+dis+rad+tax+ptratio)
best.subset <- as.formula(crim01~zn+nox+age+rad+black+medv)
# Logistic Regression - using best subset
glm.fit <- glm(
formula = best.subset,
data = Boston.train,
family = binomial
)
glm.probs <- predict(
glm.fit,
Boston.test,
type="response"
)
glm.pred <- rep(0, numrows.test)
glm.pred[glm.probs >.5] <- 1
table(
"Predicted" = glm.pred,
"Actual" = crim01.test
)
## Actual
## Predicted 0 1
## 0 118 19
## 1 10 106
sprintf("prediction error: %.4f", mean(glm.pred != crim01.test))
## [1] "prediction error: 0.1146"
# Logistic regression - using significant predictors
glm.fit <- glm(
formula = significant.predictors,
data = Boston.train,
family = binomial
)
glm.probs <- predict(
glm.fit,
Boston.test,
type="response"
)
glm.pred <- rep(0, numrows.test)
glm.pred[glm.probs >.5] <- 1
table(
"Predicted" = glm.pred,
"Actual" = crim01.test
)
## Actual
## Predicted 0 1
## 0 118 15
## 1 10 110
sprintf("prediction error: %.4f", mean(glm.pred != crim01.test))
## [1] "prediction error: 0.0988"
# LDA - using best subset
lda.fit <- lda(
formula = best.subset,
data = Boston.train
)
lda.pred <- predict(
object = lda.fit,
newdata = Boston.test
)
lda.class <- lda.pred$class
table(
"Predicted" = lda.class,
"Actual" = crim01.test
)
## Actual
## Predicted 0 1
## 0 122 31
## 1 6 94
sprintf("prediction error: %.4f", mean(lda.class != crim01.test))
## [1] "prediction error: 0.1462"
# LDA - using significant predictors
lda.fit <- lda(
formula = significant.predictors,
data = Boston.train
)
lda.pred <- predict(
object = lda.fit,
newdata = Boston.test
)
lda.class <- lda.pred$class
table(
"Predicted" = lda.class,
"Actual" = crim01.test
)
## Actual
## Predicted 0 1
## 0 118 33
## 1 10 92
sprintf("test prediction error: %.4f", mean(lda.class != crim01.test))
## [1] "test prediction error: 0.1700"
# KNN - Using best subset
maxk <- 100
standardized.X <- scale(Boston[, 1:14])
train.Y <- crim01[train]
test.Y <- crim01[-train]
predictors <- all.vars(best.subset[[3]])
train.X <- standardized.X[train, predictors]
test.X <- standardized.X[-train, predictors]
best_knn(maxk, train.X, test.X, train.Y, test.Y)
## [1] "best k = 3"
## [1] "percentage correct = 89.33"
## [1] "prediction error = 0.1067"
## Actual
## Predicted 0 1
## 0 118 17
## 1 10 108
# KNN - Using significant predictors
predictors <- all.vars(significant.predictors[[3]])
train.X <- standardized.X[train, predictors]
test.X <- standardized.X[-train, predictors]
best_knn(maxk, train.X, test.X, train.Y, test.Y)
## [1] "best k = 1"
## [1] "percentage correct = 96.05"
## [1] "prediction error = 0.0395"
## Actual
## Predicted 0 1
## 0 126 8
## 1 2 117
In the lab, we applied random forests to the Boston data using mtry=6 and using ntree=25 and ntree=500. Create a plot displaying the test error resulting from random forests on this data set for a more comprehensive range of values for mtry and ntree. Describe the results obtained.
# Range of values for mtry (number of variables to select from).
# There are only 13 (not including our predictor, medv), so we may as well try them all.
mtry <- seq(1,13)
# Less sure what to cover in terms of range of ntree (number of trees to build). It's
# likely that there will be a bigger difference between 2 trees vs 5, 10, etc, than there
# will be between 100 and 200. So I'm using a sequence of 1 to 10, in steps of 1, and then
# a sequence of 25 to 500, in steps of 25.
ntree <- c(1:10, seq(25,500,25))
# Set up a vector to contain our test errors for each combination of mtry and ntree.
err <- vector('numeric', length(mtry) * length(ntree))
index <- 1
# And now loop through the combinations, fitting a random forest model to each.
for (i in mtry) {
for (j in ntree) {
# fit the model to the training set
rf.boston <- randomForest(
formula = medv ~ . -crim01, # exclude crim01 from the previous question
data = Boston,
subset = train,
mtry = i,
ntree = j
)
# predict medv from the test data
medv.rf <- predict(
object = rf.boston,
newdata = Boston.test
)
# store the error for later
mse <- mean((medv.rf - Boston.test$medv) ^ 2)
err[index] <- mse
index <- index + 1
}
}
# Now analyse the errors we recorded during the loop.
# err was filled with ntree varying fastest (the inner loop), so standard
# row-major index arithmetic recovers the (mtry, ntree) pair for a 1-based
# index:
#   mtry position  = (index - 1) %/% length(ntree) + 1
#   ntree position = (index - 1) %%  length(ntree) + 1
# (The original code used floor(index / length(ntree)) with no -1/+1
# adjustment, which mis-reports the winning mtry and ntree for most
# indices - e.g. index 217 lies in the mtry=8 block, not mtry=7.)
min.err.index <- which.min(err)
min.err.mtry <- mtry[(min.err.index - 1) %/% length(ntree) + 1]
min.err.ntree <- ntree[(min.err.index - 1) %% length(ntree) + 1]
sprintf("min error %.4f found with mtry=%d, ntree=%d", err[min.err.index], min.err.mtry, min.err.ntree)
## NOTE: the original run printed "min error 11.9062 found with mtry=7, ntree=7"
## via the off-by-one lookup; the error value is unchanged but the reported
## mtry/ntree pair will now be correct.
# Do a basic 2-D plot of ntree vs error, with 13 lines representing each
# of our 13 values of mtry.
colors <- rainbow(max(mtry))
lty <- c(rep(1, 8), rep(2, 5))
numx <- 10 # just plotting the first 10 error values for each mtry, for clarity
for (i in mtry) {
# The errors for mtry == i occupy a contiguous run of length(ntree) entries
# in err (ntree was the inner loop), so the run starts at
# (i-1)*length(ntree)+1 and we take its first numx values.
# (The original advanced start/end by numx, so every i re-plotted slices
# of the mtry=1 results rather than the results for mtry=i.)
start <- (i - 1) * length(ntree) + 1
errs <- err[start:(start + numx - 1)]
col <- colors[i]
if (i == 1) {
plot(ntree[1:numx], errs, col = col, type = "l", xlab = "ntree", ylab = "mse", ylim = c(9, 65))
}
else {
lines(ntree[1:numx], errs, col = col, type = "l", lty = lty[i])
}
}
# Use the same rainbow palette in the legend as on the lines themselves.
# (The original passed col = mtry+1, i.e. default-palette indices, which
# did not match the rainbow() colours actually plotted.)
legend("topright", as.character(mtry), col = colors, cex = 1, lty = lty, title = "mtry")
# We have an error that varies along two dimensions, e.g. a surface.
# So a 3-D representation might be helpful. For which we need a matrix,
# not a vector.
# byrow=TRUE matches the fill order of err (ntree was the inner loop),
# giving one row per mtry value and one column per ntree value.
z <- matrix(err, nrow=length(mtry), ncol=length(ntree), byrow=TRUE)
# Try the default persp plot from R
persp(
x = mtry,
y = ntree,
z = z,
theta = 45,
phi = 45,
zlab = "mse"
)
# Also try heatmap.2 from the gplots package.
heatmap.2(
z,
Rowv = "none",
Colv = "none",
dendrogram = "none",
density.info = "none",
trace = "none",
labRow = mtry,
labCol = ntree,
xlab = "ntree",
ylab = "mtry",
key.xlab = "mse",
col = diverge_hcl(length(mtry), c = 100, l = c(50, 90), power = 1)
)
# Reset globals and detach before the next question.
clear_workspace()
detach(Boston)
In the lab, a classification tree was applied to the Carseats data set after converting Sales into a qualitative response variable. Now we will seek to predict Sales using regression trees and related approaches, treating the response as a quantitative variable.
# Attach so columns can be referenced directly; detached at the end of this question.
attach(Carseats)
(a) Split the data set into a training set and a test set.
# Using same 50/50 split as the book: draw half of the row indices at
# random for training; the remaining rows form the test set.
train <- sample(nrow(Carseats), size = nrow(Carseats) / 2)
Carseats.test <- Carseats[-train, ]
(b) Fit a regression tree to the training set. Plot the tree, and interpret the results. What test MSE do you obtain?
# Fit a regression tree predicting Sales from all other columns,
# using only the training rows.
Carseats.tree <- tree(
formula = Sales~.,
data = Carseats,
subset = train
)
summary(Carseats.tree)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats, subset = train)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "CompPrice" "Age" "Advertising"
## Number of terminal nodes: 14
## Residual mean deviance: 2.124 = 395 / 186
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.22800 -0.90350 0.05843 0.00000 0.79680 3.79300
# Visualise the fitted tree with labelled splits.
plot(Carseats.tree)
text(Carseats.tree, pretty = 0, font = 4, cex = 0.75)
# Predict Sales on the held-out rows and compute the test MSE.
Carseats.pred <- predict(
object = Carseats.tree,
newdata = Carseats.test)
sprintf("test mse: %.4f", mean((Carseats.pred - Carseats.test$Sales) ^ 2))
## [1] "test mse: 4.7117"
(c) Use cross-validation in order to determine the optimal level of tree complexity. Does pruning the tree improve the test MSE?
# Perform the cross-validation
Carseats.cv.tree <- cv.tree(Carseats.tree)
# Output the results.
Carseats.cv.tree
## $size
## [1] 14 13 12 11 10 9 8 7 6 5 4 3 2 1
##
## $dev
## [1] 808.0497 829.7067 839.8778 863.0028 871.3421 1011.5556 1025.3040
## [8] 1030.4866 1030.4866 1066.6634 1087.5819 1133.3372 1193.5623 1471.3207
##
## $k
## [1] -Inf 18.02420 18.50311 19.21496 22.57085 47.82583 49.58968
## [8] 51.36982 54.55920 83.87959 99.43388 114.56276 173.30361 285.82782
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Determine the best tree size: the size whose CV deviance is smallest.
sprintf("best tree size: %d",
Carseats.cv.tree$size[which.min(Carseats.cv.tree$dev)]
)
## [1] "best tree size: 14"
# Plot deviance vs tree size
ggplot() +
geom_line(
colour = "blue",
alpha = 0.4,
aes(
x = Carseats.cv.tree$size,
y = Carseats.cv.tree$dev
)
) +
scale_x_continuous(
breaks = Carseats.cv.tree$size
) +
ggtitle("Deviance vs Tree Size") +
labs(
x = "tree size",
y = "deviance"
)
# Pruning doesn't seem to be necessary in this case
# (the full 14-node tree already has the lowest CV deviance).
(d) Use the bagging approach in order to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important.
# Fit randomForest model to the training data, using all predictors
# e.g. bagging. Specify importance = TRUE so that we can use this
# feature for later interpretation.
# length(Carseats)-1 is the number of columns minus the response (10),
# so mtry equals the full predictor count - which is what makes this bagging.
Carseats.bag <- randomForest(
formula = Sales~.,
data = Carseats,
subset = train,
mtry = length(Carseats)-1,
importance = TRUE
)
# Print the model descriptives
Carseats.bag
##
## Call:
## randomForest(formula = Sales ~ ., data = Carseats, mtry = length(Carseats) - 1, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.56219
## % Var explained: 64.26
# Now predict the values of Sales in the test data.
Carseats.pred <- predict(
object = Carseats.bag,
newdata = Carseats.test
)
# Print/calculate the MSE from test data predictions.
sprintf("test mse: %.4f", mean((Carseats.pred - Carseats.test$Sales) ^ 2))
## [1] "test mse: 2.8308"
# See which predictors made the biggest contributions to MSE / node purity.
importance(Carseats.bag)
## %IncMSE IncNodePurity
## CompPrice 23.6361692 143.294132
## Income 2.1541598 66.972171
## Advertising 11.9378536 101.667610
## Population 0.9711789 57.362038
## Price 56.4945990 474.718399
## ShelveLoc 45.2439143 357.441953
## Age 14.6443266 153.419118
## Education -4.2397685 30.460911
## Urban -0.8729681 4.965755
## US -0.8521151 5.722876
(e) Use random forests to analyze this data. What test MSE do you obtain? Use the importance() function to determine which variables are most important. Describe the effect of m, the number of variables considered at each split, on the error rate obtained.
# See below. For 1 <= m <= 10, MSE ranged from 2.8172 to 5.4342, with an average of 3.3177. The importance function suggests that Price and ShelveLoc are the two most important predictors, followed by CompPrice, and then Age and Advertising.
MSE was monotonically decreasing with (or at least, not increasing) with m, with the lowest test MSE using the full 10 predictor variables. The test MSE was also similar to that from the bagging model. This suggests that the predictor variables were not strongly correlated, otherwise we would expect to see an improvement using randomForest (with its decorrelation of predictors). This was supported by a call to VIF against a standard linear model.# Sales~. has 10 predictors, so we're going to build 10 trees - with each tree considering one of 1:10 variables at each split.
# Try every possible mtry (1..10 predictors considered at each split),
# recording the test MSE for each.
mtrys <- seq(1,10)
err <- vector('numeric', length(mtrys))
index <- 1
for (mtry in mtrys) {
# Loop round fitting each tree to the training data,
# and storing its associated test MSE.
Carseats.rf <- randomForest(
formula = Sales~.,
data = Carseats,
subset = train,
mtry = mtry,
importance=FALSE # no need as we're gonna refit in a bit
)
Carseats.pred <- predict(
object = Carseats.rf,
newdata = Carseats.test
)
mse <- mean((Carseats.pred - Carseats.test$Sales) ^ 2)
err[index] <- mse
index <- index + 1
}
# Now plot the test MSE vs the number of variables considered at each split.
plot(err)
# Print some summary stats.
sprintf("mse: min=%.4f, max=%.4f, mean=%.4f", min(err), max(err), mean(err))
## [1] "mse: min=2.8172, max=5.4342, mean=3.3177"
# Find which mtry resulted in minimum test MSE.
# (mtrys is 1:10, so the index of the minimum IS the mtry value.)
best.mtry <- which.min(err)
sprintf("best mtry: %d", best.mtry)
## [1] "best mtry: 10"
# Refit a tree to that value of mtry, with Importance enabled.
Carseats.rf <- randomForest(
formula = Sales~.,
data = Carseats,
subset = train,
mtry = best.mtry,
importance=TRUE
)
# And check each variable's contribution to MSE/node purity.
importance(Carseats.rf)
## %IncMSE IncNodePurity
## CompPrice 22.2323823 139.164686
## Income 2.4258522 68.609560
## Advertising 11.1183666 93.608566
## Population 0.3951717 54.022213
## Price 56.1352408 476.750617
## ShelveLoc 49.1109358 366.511018
## Age 18.3997934 157.938858
## Education -2.3454048 29.789933
## Urban -0.4835734 5.476033
## US -0.4938681 5.537742
# The best value of mtry was 10; the full number of predictors - suggesting that the
# predictors are not strongly correlated (otherwise we would have expected to see a
# difference in the RF model vs bagging). Check this with VIF.
vif(glm(formula = Sales~., data = Carseats))
## GVIF Df GVIF^(1/(2*Df))
## CompPrice 1.554618 1 1.246843
## Income 1.024731 1 1.012290
## Advertising 2.103136 1 1.450219
## Population 1.145534 1 1.070296
## Price 1.537068 1 1.239785
## ShelveLoc 1.033891 2 1.008367
## Age 1.021051 1 1.010471
## Education 1.026342 1 1.013086
## Urban 1.022705 1 1.011289
## US 1.980720 1 1.407380
# Reset globals and detach before the next question.
clear_workspace()
detach(Carseats)
This problem involves the OJ data set which is part of the ISLR package.
# Attach so columns can be referenced directly; detached at the end of this question.
attach(OJ)
(a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations.
# Randomly pick 800 of the OJ rows for training; the rest form the test set.
train <- sample(nrow(OJ), size = 800)
OJ.test <- OJ[-train, ]
(b) Fit a tree to the training data, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics about the tree, and describe the results obtained. What is the training error rate? How many terminal nodes does the tree have?
# Fit classification tree to the training data.
# (Purchase is a factor, so tree() fits a classification tree automatically.)
OJ.tree <- tree(
formula = Purchase~.,
data = OJ,
subset = train)
# Output summary info.
summary(OJ.tree)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = OJ, subset = train)
## Variables actually used in tree construction:
## [1] "LoyalCH" "SalePriceMM" "PriceDiff" "DiscMM"
## Number of terminal nodes: 7
## Residual mean deviance: 0.7636 = 605.6 / 793
## Misclassification error rate: 0.1713 = 137 / 800
(c) Type in the name of the tree object in order to get a detailed text output. Pick one of the terminal nodes, and interpret the information displayed.
# Print the full tree structure: each row shows the split rule, node size,
# deviance, predicted class and class proportions; '*' marks terminal nodes.
print(OJ.tree)
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 1079.000 CH ( 0.59625 0.40375 )
## 2) LoyalCH < 0.48285 318 342.600 MM ( 0.22956 0.77044 )
## 4) LoyalCH < 0.0356415 58 10.100 MM ( 0.01724 0.98276 ) *
## 5) LoyalCH > 0.0356415 260 306.800 MM ( 0.27692 0.72308 )
## 10) SalePriceMM < 2.04 140 128.300 MM ( 0.17143 0.82857 ) *
## 11) SalePriceMM > 2.04 120 161.500 MM ( 0.40000 0.60000 ) *
## 3) LoyalCH > 0.48285 482 426.700 CH ( 0.83817 0.16183 )
## 6) LoyalCH < 0.764572 231 280.000 CH ( 0.70563 0.29437 )
## 12) PriceDiff < 0.145 87 119.700 MM ( 0.44828 0.55172 )
## 24) DiscMM < 0.47 71 98.070 CH ( 0.53521 0.46479 ) *
## 25) DiscMM > 0.47 16 7.481 MM ( 0.06250 0.93750 ) *
## 13) PriceDiff > 0.145 144 116.000 CH ( 0.86111 0.13889 ) *
## 7) LoyalCH > 0.764572 251 84.050 CH ( 0.96016 0.03984 ) *
(d) Create a plot of the tree, and interpret the results.
# Plot the tree and overlay the split/leaf labels.
plot(OJ.tree)
text(OJ.tree, pretty = 0, font = 4, cex = 0.75)
(e) Predict the response on the test data, and produce a confusion matrix comparing the test labels to the predicted test labels. What is the test error rate?
# Predict the class labels for the held-out rows.
OJ.tree.pred <- predict(
object = OJ.tree,
newdata = OJ.test,
type = "class"
)
# Confusion matrix: predicted vs actual purchase.
table(
"Predicted" = OJ.tree.pred,
"Actual" = OJ.test$Purchase
)
## Actual
## Predicted CH MM
## CH 153 29
## MM 23 65
# Overall test misclassification rate.
sprintf("prediction error: %.4f", mean(OJ.tree.pred != OJ.test$Purchase))
## [1] "prediction error: 0.1926"
(f) Apply the cv.tree() function to the training set in order to determine the optimal tree size.
# Cross-validate over tree sizes, pruning by misclassification rate
# rather than deviance (FUN = prune.misclass).
OJ.cv.tree <- cv.tree(
object = OJ.tree,
FUN = prune.misclass)
OJ.cv.tree
## $size
## [1] 7 5 2 1
##
## $dev
## [1] 156 156 151 323
##
## $k
## [1] -Inf 0.000000 4.666667 172.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
(g) Produce a plot with tree size on the x-axis and cross-validated classification error rate on the y-axis.
# Plot CV error vs tree size.
# NOTE(review): because FUN = prune.misclass was used, $dev here is the CV
# misclassification COUNT, not deviance - the title/axis label are slightly
# misleading (left unchanged to preserve the rendered output).
ggplot() +
geom_line(
colour = "blue",
alpha = 0.4,
aes(
x = OJ.cv.tree$size,
y = OJ.cv.tree$dev
)
) +
scale_x_continuous(
breaks = OJ.cv.tree$size
) +
ggtitle("Deviance vs Tree Size") +
labs(
x = "tree size",
y = "deviance"
)
(h) Which tree size corresponds to the lowest cross-validated classification error rate?
# The tree size whose CV misclassification count is smallest
# (which.min returns the first index in case of ties).
OJ.cv.tree.best <- OJ.cv.tree$size[which.min(OJ.cv.tree$dev)]
sprintf("best tree size: %d", OJ.cv.tree.best)
## [1] "best tree size: 2"
(i) Produce a pruned tree corresponding to the optimal tree size obtained using cross-validation. If cross-validation does not lead to selection of a pruned tree, then create a pruned tree with five terminal nodes.
# Prune the full tree down to the CV-selected size.
OJ.tree.prune <- prune.misclass(OJ.tree, best=OJ.cv.tree.best)
(j) Compare the training error rates between the pruned and unpruned trees. Which is higher?
# Training error of the pruned tree (compare with 0.1713 for the unpruned tree).
summary(OJ.tree.prune)
##
## Classification tree:
## snip.tree(tree = OJ.tree, nodes = 2:3)
## Variables actually used in tree construction:
## [1] "LoyalCH"
## Number of terminal nodes: 2
## Residual mean deviance: 0.9641 = 769.4 / 798
## Misclassification error rate: 0.1888 = 151 / 800
(k) Compare the test error rates between the pruned and unpruned trees. Which is higher?
# Test-set predictions from the pruned tree.
OJ.tree.pred <- predict(
object = OJ.tree.prune,
newdata = OJ.test,
type = "class"
)
# Confusion matrix for the pruned tree.
table(
"Predicted" = OJ.tree.pred,
"Actual" = OJ.test$Purchase
)
## Actual
## Predicted CH MM
## CH 155 32
## MM 21 62
# Test error of the pruned tree (compare with 0.1926 unpruned).
sprintf("prediction error: %.4f", mean(OJ.tree.pred != OJ.test$Purchase))
## [1] "prediction error: 0.1963"
# Reset globals and detach before the next question.
clear_workspace()
detach(OJ)
We now use boosting to predict Salary in the Hitters data set.
(a) Remove the observations for whom the salary information is unknown, and then log-transform the salaries.
# Drop rows where Salary is missing, then log-transform Salary
# before modelling.
Hitters <- na.omit(Hitters)
Hitters$Salary <- log(Hitters$Salary)
attach(Hitters)
(b) Create a training set consisting of the first 200 observations, and a test set consisting of the remaining observations.
# First 200 observations train; the remainder test (as per the question).
train <- 1:200
Hitters.train <- Hitters[train,]
Hitters.test <- Hitters[-train,]
(c) Perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter λ. Produce a plot with different shrinkage values on the x-axis and the corresponding training set MSE on the y-axis.
# Specify our sequence of lambda values.
# Am using 0 to 0.01 in steps of 0.001, initially, and then
# 0.015 to 0.2 in steps of 0.01.
shrinkages <- c(seq(0, 0.01, 0.001),seq(0.015,0.2,0.01))
# Set up our data structure for storing mse.
err <- vector('numeric', length(shrinkages))
# Loop over our range of lambda values recording mse against training data.
index <- 1
for (shrinkage in shrinkages) {
Hitters.boost <- gbm(
formula = Salary~.,
data = Hitters.train,
distribution = "gaussian",
n.trees = 1000,
shrinkage = shrinkage
)
# No newdata: predict scores the training data, which is intentional here
# since part (c) asks for TRAINING MSE. NOTE(review): newer gbm versions
# may warn or require newdata - confirm against the installed version.
Hitters.pred <- predict(
object = Hitters.boost,
n.trees = 1000
)
mse <- mean((Hitters.pred - Hitters.train$Salary) ^ 2)
err[index] <- mse
index <- index + 1
}
# Plot the results: training MSE as a function of the shrinkage parameter.
ggplot() +
geom_line(
colour = "blue",
alpha = 0.4,
aes(
x = shrinkages,
y = err
)
) +
scale_x_continuous(
breaks = seq(0, 0.2, 0.01)
) +
theme(
axis.text.x = element_text(angle=90, vjust=0.1, hjust=1)
) +
ggtitle("Training MSE vs lambda") +
labs(
x = "lambda",
y = "err"
)
# Report the shrinkage value with the lowest training MSE.
sprintf(
"Minimum training MSE %.4f found with lambda value %.3f", min(err), shrinkages[which.min(err)]
)
## [1] "Minimum training MSE 0.0215 found with lambda value 0.195"
(d) Produce a plot with different shrinkage values on the x-axis and the corresponding test set MSE on the y-axis.
# Loop over our range of lambda values recording mse against test data.
# (err is reused from part (c) - same length, fully overwritten.)
index <- 1
for (shrinkage in shrinkages) {
Hitters.boost <- gbm(
formula = Salary~.,
data = Hitters.train,
distribution = "gaussian",
n.trees = 1000,
shrinkage = shrinkage
)
# This time predict on the held-out test rows.
Hitters.pred <- predict(
object = Hitters.boost,
newdata = Hitters.test,
n.trees = 1000
)
mse <- mean((Hitters.pred - Hitters.test$Salary) ^ 2)
err[index] <- mse
index <- index + 1
}
# Plot the results: test MSE as a function of the shrinkage parameter.
ggplot() +
geom_line(
colour = "blue",
alpha = 0.4,
aes(
x = shrinkages,
y = err
)
) +
scale_x_continuous(
breaks = seq(0, 0.2, 0.01)
) +
theme(
axis.text.x = element_text(angle=90, vjust=0.1, hjust=1)
) +
ggtitle("Test MSE vs lambda") +
labs(
x = "lambda",
y = "err"
)
# Remember the best shrinkage - reused when refitting in part (f).
best.shrinkage <- shrinkages[which.min(err)]
sprintf(
"Minimum test MSE %.4f found with lambda value %.3f", min(err), best.shrinkage
)
## [1] "Minimum test MSE 0.2508 found with lambda value 0.075"
(e) Compare the test MSE of boosting with the test MSE of the best linear regression model that you can build.
# See below. The Hitters dataset has 20 variables, of which we’re using 19 as predictors. This is amenable to analysis using best subsets, so my first choice was to use this method for model selection. However the lowest test MSE, with a subset of 8 predictors, was 0.4685 - much higher than the test MSE for the best boosted model (0.2508).
I subsequently adopted the “significant predictors” methodology, developed above, to remove high VIF predictors and keep any that remained with significant p-values. This led to a model with three predictors (Runs+Years+PutOuts), and a test MSE of 0.4888. Following this, I checked with polynomial terms (up to 6th power) for each of the three predictors, and kept any with significant p-values. This model resulted in a test MSE of 0.3001. I also tested a model including interactions between each of the variables; none of these had significant p-values.
The best regression model, purely on test MSE, then, is the one with 6th degree polynomials. However this doesn’t seem to be a very good (or even plausible) model, and it is entirely possible that the low test MSE is an artefact of the procedure.
# Best Subsets
# Consider subsets of up to all 19 predictors.
nvmax <- 19
# Fit our best subsets model to the training data.
Hitters.best <- regsubsets(
x = Salary~.,
data = Hitters.train,
nvmax = nvmax
)
# Create a model matrix representation of the test data, for use in matrix algebra below.
Hitters.test.mat <- model.matrix(
object = Salary~.,
data = Hitters.test
)
# Loop through best subsets of size 1:19 recording test MSE for each.
err <- vector('numeric', nvmax)
for (nv in 1:nvmax){
# Get the coefficients for the best subset of predictors of size nv.
coefi <- coef(Hitters.best, id=nv)
# Predict values of Salary for the test data, using inner product.
pred <- Hitters.test.mat[, names(coefi)]%*%coefi
# Store the test MSE for this subset.
err[nv] <- mean((Hitters.test$Salary - pred) ^ 2)
}
# Now find which number of predictors resulted in the lowest test MSE.
bestnv <- which.min(err)
sprintf("best nv: %d", bestnv)
## [1] "best nv: 8"
# And output the matching coefficients and MSE value.
Hitters.best.coef <- coef(Hitters.best, bestnv)
print(Hitters.best.coef)
## (Intercept) AtBat Hits Walks Years
## 4.4722395349 -0.0032378215 0.0149552471 0.0105182255 0.0647798866
## CRuns CWalks DivisionW PutOuts
## 0.0012175332 -0.0010136188 -0.1482230049 0.0005050225
sprintf("best subsets test mse: %.4f", err[bestnv])
## [1] "best subsets test mse: 0.4685"
# Significant Predictors
# Check for colinearities. As before, start with the full model, and then remove any
# predictors in turn until all VIF values are < 5.
# (Each update() below drops the predictor with the highest remaining VIF;
# the steps are strictly order-dependent, so they are left exactly as run.)
glm.fit <- glm(
formula = Salary~.,
data = Hitters.train
)
vif(glm.fit)
## AtBat Hits HmRun Runs RBI Walks
## 27.616607 32.188398 8.338397 16.708030 13.146944 4.064924
## Years CAtBat CHits CHmRun CRuns CRBI
## 9.107056 249.914822 502.336602 44.567322 169.171173 126.740678
## CWalks League Division PutOuts Assists Errors
## 20.222379 3.815089 1.087212 1.261217 2.582138 2.229147
## NewLeague
## 3.823558
glm.fit = update(glm.fit, . ~ . -CHits)
vif(glm.fit)
## AtBat Hits HmRun Runs RBI Walks
## 25.441430 25.776755 8.046291 13.916420 13.094014 3.851500
## Years CAtBat CHmRun CRuns CRBI CWalks
## 8.829910 108.666041 24.613025 60.935774 81.489877 14.158997
## League Division PutOuts Assists Errors NewLeague
## 3.760900 1.087072 1.242772 2.534373 2.217990 3.772691
glm.fit = update(glm.fit, . ~ . -CAtBat)
vif(glm.fit)
## AtBat Hits HmRun Runs RBI Walks Years
## 23.638795 25.550055 7.732099 13.514884 12.724481 3.801000 6.434961
## CHmRun CRuns CRBI CWalks League Division PutOuts
## 16.402172 27.298898 49.293859 14.139714 3.760731 1.082907 1.241575
## Assists Errors NewLeague
## 2.446384 2.214559 3.756903
glm.fit = update(glm.fit, . ~ . -CRuns)
vif(glm.fit)
## AtBat Hits HmRun Runs RBI Walks Years
## 23.156178 25.173554 7.732079 12.145104 12.346548 3.437421 6.402642
## CHmRun CRBI CWalks League Division PutOuts Assists
## 14.244544 29.799855 9.483661 3.755625 1.075675 1.216958 2.437466
## Errors NewLeague
## 2.209426 3.748700
glm.fit = update(glm.fit, . ~ . -CRBI)
vif(glm.fit)
## AtBat Hits HmRun Runs RBI Walks Years
## 22.916658 24.053957 7.098632 12.097666 11.469671 3.259073 4.632125
## CHmRun CWalks League Division PutOuts Assists Errors
## 5.058320 7.508453 3.754292 1.055571 1.210333 2.435806 2.204275
## NewLeague
## 3.728251
glm.fit = update(glm.fit, . ~ . -Hits)
vif(glm.fit)
## AtBat HmRun Runs RBI Walks Years CHmRun
## 11.697745 6.161576 9.754482 10.004820 2.978310 4.614367 5.042801
## CWalks League Division PutOuts Assists Errors NewLeague
## 7.508351 3.754285 1.052612 1.202409 2.435253 2.189567 3.724498
glm.fit = update(glm.fit, . ~ . -AtBat)
vif(glm.fit)
## HmRun Runs RBI Walks Years CHmRun CWalks
## 5.094346 4.491072 6.865407 2.977274 4.573588 5.033698 7.433926
## League Division PutOuts Assists Errors NewLeague
## 3.717577 1.038009 1.150970 2.327308 2.166085 3.678469
glm.fit = update(glm.fit, . ~ . -CWalks)
vif(glm.fit)
## HmRun Runs RBI Walks Years CHmRun League
## 5.092971 4.490243 6.500651 2.320274 2.579397 3.640863 3.701809
## Division PutOuts Assists Errors NewLeague
## 1.035579 1.145814 2.317760 2.164752 3.674730
glm.fit = update(glm.fit, . ~ . -RBI)
vif(glm.fit)
## HmRun Runs Walks Years CHmRun League Division
## 3.071587 3.556694 2.315386 2.576624 3.576527 3.685153 1.035578
## PutOuts Assists Errors NewLeague
## 1.131827 2.306302 2.143195 3.638249
# Check for significant predictors
summary(glm.fit)
##
## Call:
## glm(formula = Salary ~ HmRun + Runs + Walks + Years + CHmRun +
## League + Division + PutOuts + Assists + Errors + NewLeague,
## data = Hitters.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.76046 -0.45999 0.04022 0.42359 2.87707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3136368 0.1653621 26.086 < 2e-16 ***
## HmRun -0.0041397 0.0084161 -0.492 0.62338
## Runs 0.0135172 0.0031998 4.224 3.73e-05 ***
## Walks 0.0020382 0.0031042 0.657 0.51224
## Years 0.0757923 0.0142758 5.309 3.09e-07 ***
## CHmRun 0.0018397 0.0010145 1.813 0.07137 .
## LeagueN 0.1289025 0.1679908 0.767 0.44385
## DivisionW -0.1544367 0.0887317 -1.740 0.08341 .
## PutOuts 0.0005661 0.0001816 3.117 0.00211 **
## Assists 0.0008619 0.0004971 1.734 0.08456 .
## Errors -0.0106483 0.0101328 -1.051 0.29466
## NewLeagueN 0.0178625 0.1667754 0.107 0.91482
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.3797986)
##
## Null deviance: 166.403 on 199 degrees of freedom
## Residual deviance: 71.402 on 188 degrees of freedom
## AIC: 387.58
##
## Number of Fisher Scoring iterations: 2
# Refit the model using just the significant predictors and obtain test MSE
glm.fit = update(glm.fit, . ~ Runs+Years+PutOuts)
glm.pred <- predict(object = glm.fit, newdata = Hitters.test)
err <- mean((Hitters.test$Salary - glm.pred) ^ 2)
print(sprintf("test mse: %.4f", err))
## [1] "test mse: 0.4888"
# Polynomial terms
# Carrying on with our "significant predictors", lets try some polynomial terms in the formula.
glm.fit = update(glm.fit, . ~ . + poly(Runs,6)+poly(Years,6)+poly(PutOuts,6))
# Check which predictors are now significant
# (the degree-1 poly terms are NA/singular because the raw linear terms
# are already in the model).
summary(glm.fit)
##
## Call:
## glm(formula = Salary ~ Runs + Years + PutOuts + poly(Runs, 6) +
## poly(Years, 6) + poly(PutOuts, 6), data = Hitters.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.77884 -0.21780 0.01893 0.26334 1.30489
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3596499 0.0940270 46.366 < 2e-16 ***
## Runs 0.0122198 0.0014202 8.604 3.62e-15 ***
## Years 0.0984254 0.0066676 14.762 < 2e-16 ***
## PutOuts 0.0006039 0.0001299 4.649 6.42e-06 ***
## poly(Runs, 6)1 NA NA NA NA
## poly(Runs, 6)2 0.1950958 0.4844670 0.403 0.687643
## poly(Runs, 6)3 -1.5577288 0.4663734 -3.340 0.001017 **
## poly(Runs, 6)4 1.9045508 0.4460796 4.270 3.16e-05 ***
## poly(Runs, 6)5 -1.7152641 0.4539578 -3.778 0.000214 ***
## poly(Runs, 6)6 2.2849296 0.4793154 4.767 3.83e-06 ***
## poly(Years, 6)1 NA NA NA NA
## poly(Years, 6)2 -4.6425275 0.4613967 -10.062 < 2e-16 ***
## poly(Years, 6)3 2.3755238 0.4615639 5.147 6.86e-07 ***
## poly(Years, 6)4 1.0040153 0.4525779 2.218 0.027769 *
## poly(Years, 6)5 -0.1852489 0.4493509 -0.412 0.680638
## poly(Years, 6)6 1.5334230 0.4448595 3.447 0.000705 ***
## poly(PutOuts, 6)1 NA NA NA NA
## poly(PutOuts, 6)2 0.4063497 0.4526162 0.898 0.370496
## poly(PutOuts, 6)3 -0.3228979 0.4982753 -0.648 0.517786
## poly(PutOuts, 6)4 0.5559005 0.4871840 1.141 0.255357
## poly(PutOuts, 6)5 0.4037579 0.4646012 0.869 0.385975
## poly(PutOuts, 6)6 0.6958509 0.4822787 1.443 0.150794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1927963)
##
## Null deviance: 166.403 on 199 degrees of freedom
## Residual deviance: 34.896 on 181 degrees of freedom
## AIC: 258.39
##
## Number of Fisher Scoring iterations: 2
# Refit the model using those predictors/terms.
# (Raw I(x^k) powers are used here rather than orthogonal poly() terms,
# keeping only the degrees that were significant above.)
glm.fit = update(glm.fit, . ~ Runs+I(Runs^3)+I(Runs^4)+I(Runs^5)+I(Runs^6)+Years+I(Years^2)+I(Years^3)+I(Years^4)+I(Years^6)+PutOuts)
# Check test MSE
glm.pred <- predict(object = glm.fit, newdata = Hitters.test)
err <- mean((Hitters.test$Salary - glm.pred) ^ 2)
print(sprintf("test mse: %.4f", err))
## [1] "test mse: 0.3001"
# Interactions
# Finally, let's keep our three main predictors - but introduce interactions between them.
glm.fit = update(glm.fit, . ~ . +Runs:Years+Runs:PutOuts+Years:PutOuts)
# Check if any interactions are significant (spoiler alert: no).
summary(glm.fit)
##
## Call:
## glm(formula = Salary ~ Runs + I(Runs^3) + I(Runs^4) + I(Runs^5) +
## I(Runs^6) + Years + I(Years^2) + I(Years^3) + I(Years^4) +
## I(Years^6) + PutOuts + Runs:Years + Runs:PutOuts + Years:PutOuts,
## data = Hitters.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8352 -0.2219 0.0109 0.2932 1.9681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.957e+00 4.788e-01 10.352 < 2e-16 ***
## Runs -7.914e-02 2.325e-02 -3.404 0.000814 ***
## I(Runs^3) 8.254e-05 2.423e-05 3.406 0.000809 ***
## I(Runs^4) -1.787e-06 5.702e-07 -3.134 0.002009 **
## I(Runs^5) 1.444e-08 5.012e-09 2.881 0.004430 **
## I(Runs^6) -4.091e-11 1.536e-11 -2.664 0.008404 **
## Years 3.610e-01 2.232e-01 1.617 0.107487
## I(Years^2) 1.246e-04 5.480e-02 0.002 0.998189
## I(Years^3) -2.334e-03 5.340e-03 -0.437 0.662644
## I(Years^4) 9.025e-05 1.924e-04 0.469 0.639488
## I(Years^6) -2.253e-08 8.722e-08 -0.258 0.796480
## PutOuts 4.082e-04 4.864e-04 0.839 0.402424
## Runs:Years 4.407e-04 3.383e-04 1.303 0.194334
## Runs:PutOuts 1.189e-06 5.750e-06 0.207 0.836460
## Years:PutOuts 3.832e-06 3.081e-05 0.124 0.901140
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2521698)
##
## Null deviance: 166.403 on 199 degrees of freedom
## Residual deviance: 46.651 on 185 degrees of freedom
## AIC: 308.45
##
## Number of Fisher Scoring iterations: 2
(f) Which variables appear to be the most important predictors in the boosted model?
# Fit a boosted regression tree using the best lambda value we found earlier
# (best.shrinkage, selected by test MSE in part (d)).
Hitters.boost <- gbm(
formula = Salary~.,
data = Hitters.train,
distribution = "gaussian",
n.trees = 1000,
shrinkage = best.shrinkage
)
# Output the summary (relative influence of variables).
# las/xlim are forwarded to the barplot via ... for readable labels.
summary(Hitters.boost, las = 2, xlim = c(0,26))
(g) Now apply bagging to the training set. What is the test set MSE for this approach?
# Bagging = random forest with mtry equal to the full predictor count
# (length(Hitters)-1 excludes the Salary response column).
Hitters.bag <- randomForest(
formula = Salary~.,
data = Hitters.train,
mtry = length(Hitters)-1
)
# Predict on the test rows and compute the test MSE.
Hitters.pred <- predict(
object = Hitters.bag,
newdata = Hitters.test
)
sprintf("test mse: %.4f", mean((Hitters.pred - Hitters.test$Salary) ^ 2))
## [1] "test mse: 0.2284"
# Detach and reset globals before the next question.
detach(Hitters)
clear_workspace()
This question uses the Caravan data set.
# Creating a dummy variable, since otherwise get the following error in gbm fit:
# "Error in gbm.fit Bernoulli requires the response to be in {0,1}". Alternative
# would be to use e.g. unclass(Purchase)-1 ~ 1 as the formula, but for the sake of clarity...
PurchaseY <- ifelse(Caravan$Purchase == 'Yes', 1, 0)
# Append the 0/1 column to the data frame, then drop the free-standing copy.
Caravan <- data.frame(Caravan, PurchaseY)
rm(PurchaseY)
attach(Caravan)
(a) Create a training set consisting of the first 1,000 observations, and a test set consisting of the remaining observations.
# First 1,000 observations train; the remainder test (as per the question).
train <- 1:1000
Caravan.train <- Caravan[train,]
Caravan.test <- Caravan[-train,]
# Cached test-set size, used when building the prediction vector below.
Caravan.test.numrows <- nrow(Caravan.test)
(b) Fit a boosting model to the training set with Purchase as the response and the other variables as predictors. Use 1,000 trees, and a shrinkage value of 0.01. Which predictors appear to be the most important?
# Fit a boosted classification tree.
# Note that we get some warnings about variables having no variation;
# presumably e.g. AVRAAUT is all one value *in our training split*
# We could try another split, but this would likely bias the results -
# plus the instructions explicitly state that the first 1000 observations
# should be the training set.
Caravan.boost <- gbm(
# exclude the original factor response; PurchaseY is its 0/1 encoding.
formula = PurchaseY~.-Purchase,
data = Caravan.train,
distribution = "bernoulli", # as response is categorical
n.trees = 1000,
shrinkage = 0.01
)
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x, y, offset = offset, distribution = distribution, w =
## w, : variable 71: AVRAAUT has no variation.
# Output the summary (relative influence of variables).
summary(Caravan.boost, plotit=FALSE)
(c) Use the boosting model to predict the response on the test data. Predict that a person will make a purchase if the estimated probability of purchase is greater than 20%. Form a confusion matrix. What fraction of the people predicted to make a purchase do in fact make one? How does this compare with the results obtained from applying KNN or logistic regression to this data set?
# Boosting
# Score the test set; type = "response" returns P(purchase) on the 0-1 scale.
Caravan.pred <- predict(
object = Caravan.boost,
newdata = Caravan.test,
n.trees = 1000,
type = "response"
)
# Classify as "Yes" whenever the predicted purchase probability exceeds 20%.
Caravan.pred.purchase <- rep("No", Caravan.test.numrows)
Caravan.pred.purchase[Caravan.pred > .2] <- "Yes"
# output the confusion matrix
# (named 'confusion' rather than 'matrix', as the original did, so that
# base::matrix is not shadowed; also use <- per house style)
confusion <- table(
"Predicted" = Caravan.pred.purchase,
"Actual" = Caravan.test$Purchase
)
print(confusion)
## Actual
## Predicted No Yes
## No 4409 258
## Yes 124 31
# output the fraction of predicted purchasers, who actually purchase
# (precision: row 2 is the "Yes" predictions)
sprintf("fraction of Yes predictions which were correct: %.4f",
confusion[2,2] / (confusion[2,1] + confusion[2,2]))
## [1] "fraction of Yes predictions which were correct: 0.2000"
# Logistic Regression
# Fit the model.
# Note that we get some warnings about "fitted probabilities numerically 0 or 1 occurred";
# again, presumably Purchase is always either Yes or No with respect to one or more of our
# predictor variables - resulting in them being given unreasonably large weightings (per
# discussion at https://www.r-bloggers.com/learn-logistic-regression-and-beyond).
Caravan.glm.fit <- glm(
  # not using bernoulli, so can use the regular Purchase factor as the response.
  formula = Purchase ~ . - PurchaseY,
  data = Caravan.train,
  family = binomial
  # NB: the original also passed `subset = train`; since `data` is already the
  # 1000-row Caravan.train, that re-selected every row - redundant (and fragile
  # should the split ever change), so it has been dropped.
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Per the warnings, we should expect to see that certain variables have been assigned
# much larger coefficients than the others. ABESAUT, for example (173.037773070).
coef(Caravan.glm.fit)
## (Intercept) MOSTYPE MAANTHUI MGEMOMV MGEMLEEF
## 256.111613608 -0.018137836 0.071310327 -0.929793800 0.041869025
## MOSHOOFD MGODRK MGODPR MGODOV MGODGE
## 0.139981328 -0.723032025 -0.320095654 -0.649867202 -0.277324784
## MRELGE MRELSA MRELOV MFALLEEN MFGEKIND
## 0.798945489 0.691674967 0.876189582 -0.370222996 -0.298613883
## MFWEKIND MOPLHOOG MOPLMIDD MOPLLAAG MBERHOOG
## -0.053447705 -0.434233684 -0.693582737 -0.511762343 0.108009835
## MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO
## -0.273724128 -0.578837395 0.419310171 0.278734998 0.351076566
## MSKA MSKB1 MSKB2 MSKC MSKD
## 0.359087294 0.143421675 0.178304030 0.109316099 -0.386861444
## MHHUUR MHKOOP MAUT1 MAUT2 MAUT0
## -15.683220360 -15.614447804 0.423286795 0.430429974 0.225560470
## MZFONDS MZPART MINKM30 MINK3045 MINK4575
## -13.758701896 -13.768602205 0.112284354 0.092553590 0.260611332
## MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART
## 0.360128618 -0.140712551 -0.364332347 0.232451203 0.934257620
## PWABEDR PWALAND PPERSAUT PBESAUT PMOTSCO
## -0.905582577 -17.518973254 0.375731995 -37.923562822 0.222956221
## PVRAAUT PAANHANG PTRACTOR PWERKT PBROM
## NA 15.732379673 -0.332629103 16.761744979 -0.109445550
## PLEVEN PPERSONG PGEZONG PWAOREG PBRAND
## 0.240508000 1.268522655 19.997061590 14.524192278 0.033690501
## PZEILPL PPLEZIER PFIETS PINBOED PBYSTAND
## 37.381664743 0.637109636 21.014603746 0.569945986 0.416706179
## AWAPART AWABEDR AWALAND APERSAUT ABESAUT
## -1.474801231 -14.172998132 54.155870772 -0.657163782 173.037773070
## AMOTSCO AVRAAUT AAANHANG ATRACTOR AWERKT
## -0.114337560 NA -30.987605267 -15.450929240 NA
## ABROM ALEVEN APERSONG AGEZONG AWAOREG
## -0.251095198 -0.860027186 -18.787781645 -57.625956606 -31.901920342
## ABRAND AZEILPL APLEZIER AFIETS AINBOED
## 0.572476509 NA 1.083398968 -19.285105277 -19.789292840
## ABYSTAND
## -0.005834623
# Pull out the suspiciously large coefficients (> 30), dropping NAs from the
# rank-deficient fit. Exclude the intercept by NAME rather than by position:
# the original dropped element 1, which silently breaks if the intercept
# doesn't happen to pass the > 30 filter.
high.coef <- na.omit(coef(Caravan.glm.fit)[coef(Caravan.glm.fit) > 30])
high.coef[names(high.coef) != "(Intercept)"]
## PZEILPL AWALAND ABESAUT
## 37.38166 54.15587 173.03777
# Make predictions (probabilities) on the test data.
Caravan.pred <- predict(
  object = Caravan.glm.fit,
  newdata = Caravan.test,
  type = "response"
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Convert our probabilities into No/Yes responses, with 20% threshold.
Caravan.pred.purchase <- rep("No", Caravan.test.numrows)
Caravan.pred.purchase[Caravan.pred > .2] <- "Yes"
# Output the confusion matrix (conf.mat, so base::matrix isn't shadowed).
conf.mat <- table(
  "Predicted" = Caravan.pred.purchase,
  "Actual" = Caravan.test$Purchase
)
print(conf.mat)
## Actual
## Predicted No Yes
## No 4183 231
## Yes 350 58
# Output the fraction of predicted purchasers who actually purchase (precision).
sprintf("fraction of Yes predictions which were correct: %.4f",
  conf.mat["Yes", "Yes"] / sum(conf.mat["Yes", ]))
## [1] "fraction of Yes predictions which were correct: 0.1422"
# KNN
# Eyeballing the data, it's possible that we don't need to standardize the values -
# since they all seem to be in the same range already. However just to be sure..
maxk <- 100
# Standardize every predictor; exclude the two response columns by name
# (Purchase and its 0/1 copy PurchaseY) rather than by position.
standardized.X <- scale(Caravan[, !(names(Caravan) %in% c("Purchase", "PurchaseY"))])
train.X <- standardized.X[train, ]
test.X <- standardized.X[-train, ]
train.Y <- Caravan.train$Purchase
test.Y <- Caravan.test$Purchase
best_knn(maxk, train.X, test.X, train.Y, test.Y)
## [1] "best k = 11"
## [1] "percentage correct = 94.01"
## [1] "prediction error = 0.0599"
## Actual
## Predicted No Yes
## No 4533 289
## Yes 0 0
# OK, KNN predicted zero of the 289 actual Purchases. This is probably because the
# error being minimised is "knn.pred != test.Y" e.g. classification errors overall,
# rather than just false positives. Let's try again, only this time minimise the
# error rate among the observations we predict as 'Yes'.
# Preallocate the error vector. (The original passed an empty `length =` argument,
# which silently fell back to the default length 0 and grew the vector each pass.)
err <- vector(mode = "numeric", length = maxk)
for (k in seq_len(maxk)) {
  knn.pred <- knn(
    train = train.X,
    test = test.X,
    cl = train.Y,
    k = k
  )
  # Positions predicted as 'Yes'. The original indexed with the factor values
  # themselves (pred.y was a factor of "Yes"s), which R coerces to the integer
  # level codes - i.e. it compared the wrong elements entirely.
  pred.yes <- which(knn.pred == 'Yes')
  # Error rate restricted to predicted purchasers. NaN when nothing is
  # predicted 'Yes'; which.min below skips NaN values.
  err[k] <- mean(knn.pred[pred.yes] != test.Y[pred.yes])
}
# Find the value of k which had the minimum error.
bestk <- which.min(err)
sprintf("best k = %d", bestk)
## [1] "best k = 1"
# NOTE(review): the transcript above predates the indexing fix; re-run to refresh.
# Now retrain the model with the best value of k.
knn.pred <- knn(
  train = train.X,
  test = test.X,
  cl = train.Y,
  k = bestk
)
# Output the confusion matrix (conf.mat, so base::matrix isn't shadowed).
conf.mat <- table(
  "Predicted" = knn.pred,
  "Actual" = test.Y
)
print(conf.mat)
## Actual
## Predicted No Yes
## No 4249 252
## Yes 284 37
# Output the fraction of predicted purchasers who actually purchase (precision).
print(conf.mat["Yes", "Yes"] / sum(conf.mat["Yes", ]))
## [1] 0.1152648
clear_workspace()
detach(Caravan)
# Build a Ridge and a Lasso model for the problem considered in Exercise 10 from Chapter 8 of Tibshirani's ISLR book, with particular values of the models' parameters chosen by you. Then tune/optimise the models with cross validation. Compare your optimised Ridge and Lasso models with the models you developed in point (e) of the above mentioned exercise, and choose the best one justifying your choice.
# See below. The model with the lowest test MSE (0.3001) from (e) above - the model with 6th order polynomial terms - resulted in both Lasso and Ridge models having a higher test MSE (0.3872 and 0.4069 respectively). This seems surprising, however given that both Ridge and Lasso punish complexity, and given that the success of this model on the test data is likely have been due to its complexity, then perhaps it is an indicator that this is not a good model.
# Alternatively, using all predictors, both Lasso and Ridge had marginally lower test MSE than the best subsets model from (e) above; 0.4714 and 0.4554, respectively - vs the earlier 0.4888.
# In terms of choosing one of the three as the best linear model, then irrespective of the lowest test MSE I would pick ridge. The model is already reasonably sparse, reducing the value of lasso over it; as can be seen from the bar chart of Ridge coefficients (using all predictors), it arrives at a similar number of "main" variables as the other two. Best subsets may sound like a better alternative (since we have a low enough number of predictors to allow a brute-force approach) however I suspect that ridge - being by design a guard against over-fitting - might cope better with unseen data, if we really did have a test (as opposed to a validation) set.
# Using model with best test MSE from earlier
# Once again remove NAs and log-transform Salary. Note we don't need to set a seed here,
# e.g. to re-create the state of our data from up the page, since we're not using any randomness.
Hitters <- na.omit(Hitters)
Hitters$Salary <- log(Hitters$Salary)
attach(Hitters)
# Create a model matrix of our predictors (using the best combination of predictors/
# polynomials/interactions found in (e) above). Pass `data = Hitters` explicitly -
# the original call relied on attach() having put the columns on the search path.
x <- model.matrix(
  Salary ~ Runs + I(Runs^3) + I(Runs^4) + I(Runs^5) + I(Runs^6) +
    Years + I(Years^2) + I(Years^3) + I(Years^4) + I(Years^6) + PutOuts,
  data = Hitters
)[, -1]
# Set up our training/test split, with the training set being the first 200 observations.
train <- 1:200
y <- Hitters$Salary
x.train <- x[train, ]
x.test <- x[-train, ]
y.train <- y[train]
y.test <- y[-train]
# Ridge Regression
# Cross-validate lambda for ridge (alpha = 0) on the training split.
Hitters.cv.ridge <- cv.glmnet(x = x.train, y = y.train, alpha = 0)
Hitters.ridge.bestlam <- Hitters.cv.ridge$lambda.min
sprintf("Ridge best lambda: %.4f", Hitters.ridge.bestlam)
## [1] "Ridge best lambda: 0.0543"
plot(Hitters.cv.ridge)
# Fit ridge on the training data, then predict held-out salaries at the
# cross-validated lambda.
Hitters.ridge <- glmnet(x = x.train, y = y.train, alpha = 0)
Hitters.pred <- predict(Hitters.ridge, s = Hitters.ridge.bestlam, newx = x.test)
# Calculate the test MSE.
sprintf("Ridge test MSE: %.4f", mean((Hitters.pred - y.test)^2))
## [1] "Ridge test MSE: 0.4069"
# Inspect the coefficients from our best ridge model. Drop the intercept row with
# [-1, ] rather than the magic range 2:12, so this keeps working if the number of
# predictors ever changes.
coefs <- predict(
  object = Hitters.ridge,
  type = "coefficients",
  s = Hitters.ridge.bestlam
)[-1, ]
# Output the coefficients as a data frame.
coefs.frame <- data.frame("var" = names(data.frame(x)), "coef" = coefs)
coefs.frame[order(coefs.frame$var), ]
# Also plot the coefficients as a bar chart (x labels rotated to stay legible).
print(
  ggplot(
    data = coefs.frame,
    aes(
      x = var,
      y = coef
    )
  ) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.1, hjust = 1))
)
# Lasso
# Cross-validate lambda for the lasso (alpha = 1) on the same training split.
Hitters.cv.lasso <- cv.glmnet(x = x.train, y = y.train, alpha = 1)
Hitters.lasso.bestlam <- Hitters.cv.lasso$lambda.min
sprintf("Lasso best lambda: %.4f", Hitters.lasso.bestlam)
## [1] "Lasso best lambda: 0.0033"
plot(Hitters.cv.lasso)
# Fit the lasso on the training data, then predict held-out salaries at the
# cross-validated lambda.
Hitters.lasso <- glmnet(x = x.train, y = y.train, alpha = 1)
Hitters.pred <- predict(Hitters.lasso, s = Hitters.lasso.bestlam, newx = x.test)
# Calculate the test MSE.
sprintf("Lasso test MSE: %.4f", mean((Hitters.pred - y.test)^2))
## [1] "Lasso test MSE: 0.3872"
# Retrieve the coefficients, dropping the intercept row with [-1, ] rather than
# the magic range 2:12 (robust to a change in the number of predictors).
coefs <- predict(
  object = Hitters.lasso,
  type = "coefficients",
  s = Hitters.lasso.bestlam
)[-1, ]
# Output those coefficients which haven't been shrunk to zero.
coefs[coefs != 0]
## Runs I(Runs^3) I(Runs^6) Years I(Years^2)
## 1.270983e-02 7.462048e-08 -5.429968e-14 2.870710e-01 -8.371021e-07
## I(Years^3) I(Years^6) PutOuts
## -7.985313e-04 3.410067e-08 5.717087e-04
# Using all predictors
# Rebuild the design matrix with every predictor and redo the same 200-row split.
x <- model.matrix(Salary ~ ., data = Hitters)[, -1]
y <- Hitters$Salary
train <- seq_len(200)
x.train <- x[train, ]
x.test <- x[-train, ]
y.train <- y[train]
y.test <- y[-train]
# Ridge Regression
# Cross-validate lambda for ridge (alpha = 0) on the all-predictor training split.
Hitters.cv.ridge <- cv.glmnet(x = x.train, y = y.train, alpha = 0)
Hitters.ridge.bestlam <- Hitters.cv.ridge$lambda.min
sprintf("Ridge best lambda: %.4f", Hitters.ridge.bestlam)
## [1] "Ridge best lambda: 0.1336"
# Fit ridge with all predictors, then predict held-out salaries at the
# cross-validated lambda.
Hitters.ridge <- glmnet(x = x.train, y = y.train, alpha = 0)
Hitters.pred <- predict(Hitters.ridge, s = Hitters.ridge.bestlam, newx = x.test)
# Calculate the test MSE.
sprintf("Ridge test MSE: %.4f", mean((Hitters.pred - y.test)^2))
## [1] "Ridge test MSE: 0.4554"
# Inspect the coefficients from our best ridge model. Drop the intercept row with
# [-1, ] rather than the magic range 2:20, so this keeps working if the number of
# predictors ever changes.
coefs <- predict(
  object = Hitters.ridge,
  type = "coefficients",
  s = Hitters.ridge.bestlam
)[-1, ]
# Output the coefficients as a data frame.
coefs.frame <- data.frame("var" = names(data.frame(x)), "coef" = coefs)
coefs.frame[order(coefs.frame$var), ]
# Also plot the coefficients as a bar chart (x labels rotated to stay legible).
print(
  ggplot(
    data = coefs.frame,
    aes(
      x = var,
      y = coef
    )
  ) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.1, hjust = 1))
)
# Lasso - using all predictors (the original header comment claimed this reused
# the earlier best-MSE model; it actually uses the full design matrix from above).
# Cross-validate lambda for the lasso (alpha = 1) on the training split.
Hitters.cv.lasso <- cv.glmnet(x = x.train, y = y.train, alpha = 1)
Hitters.lasso.bestlam <- Hitters.cv.lasso$lambda.min
sprintf("Lasso best lambda: %.4f", Hitters.lasso.bestlam)
## [1] "Lasso best lambda: 0.0029"
# Fit the lasso with all predictors, then predict held-out salaries at the
# cross-validated lambda.
Hitters.lasso <- glmnet(x = x.train, y = y.train, alpha = 1)
Hitters.pred <- predict(Hitters.lasso, s = Hitters.lasso.bestlam, newx = x.test)
# Calculate the test MSE.
sprintf("Lasso test MSE: %.4f", mean((Hitters.pred - y.test)^2))
## [1] "Lasso test MSE: 0.4714"
# Retrieve the coefficients, dropping the intercept row with [-1, ] rather than
# the magic range 2:20 (robust to a change in the number of predictors).
coefs <- predict(
  object = Hitters.lasso,
  type = "coefficients",
  s = Hitters.lasso.bestlam
)[-1, ]
# Output those coefficients which haven't been shrunk to zero.
coefs[coefs != 0]
## AtBat Hits HmRun Runs RBI
## -0.0028300893 0.0133732790 0.0029395173 0.0011132052 -0.0005648055
## Walks Years CHmRun CRuns CWalks
## 0.0085259992 0.0626866539 0.0014680812 0.0008631622 -0.0008873527
## LeagueN DivisionW PutOuts Assists Errors
## 0.1222336365 -0.1441197451 0.0005032325 0.0007726510 -0.0093837816
## NewLeagueN
## -0.0111156444